fuente: https://github.com/DiegoKoz/discursos_presidenciales

library(glue)

Attaching package: ‘glue’

The following object is masked from ‘package:dplyr’:

    collapse
df <- read_rds('data/discursos_presidenciales.rds')

df <- df %>% 
  mutate(texto = tolower(texto),
         texto = stri_trans_general(texto, "Latin-ASCII"),
         texto = str_trim(texto,side = 'both'),
         texto = str_replace_all(texto,'\t',' '),
         texto = str_replace_all(texto,'\n',' '),
         texto = str_replace_all(texto,'\r',' '),
         texto = str_replace_all(texto,'[[:punct:]]',' '),
         texto = str_replace_all(texto,'\\d','NUM'),
         texto = str_replace_all(texto,'(NUM)+','NUM'),
         texto = str_replace_all(texto,"\\s+", " "))

palabras_comunes <- read_csv(file = 'data/r_words.txt',col_names = F)
Parsed with column specification:
cols(
  X1 = col_character()
)
palabras_comunes <-stri_trans_general(palabras_comunes$X1, "Latin-ASCII")
palabras_comunes <- unique(palabras_comunes)

texto <- df$texto

Corpus = VCorpus(VectorSource(texto))
Corpus = tm_map(Corpus, removeWords, c(stopwords(kind = "es"),palabras_comunes))
# Corpus <- tm_map(Corpus, stemDocument, language = "spanish") # Corpus  

dtm <- DocumentTermMatrix(Corpus)
tm::nTerms(dtm)
[1] 19409
#elimino los docuemntos vacios
rowTotals <- apply(dtm , 1, sum)
nDocs(dtm)
[1] 603
dtm   <- dtm[rowTotals> 0, ]
nDocs(dtm)
[1] 602
lda_fit
A LDA_Gibbs topic model with 10 topics.
Terms <- terms(lda_fit, 10)
Terms
      Topic 1         Topic 2          Topic 3      Topic 4           Topic 5        Topic 6        Topic 7       Topic 8      Topic 9     
 [1,] "labor"         "organizaciones" "presidente" "pami"            "mechita"      "club"         "periodista"  "mar"        "elegir"    
 [2,] "pone"          "medico"         "paises"     "tecnopolis"      "julio"        "competir"     "justicia"    "admiracion" "alumna"    
 [3,] "fragata"       "afectando"      "mundo"      "declaracion"     "capaces"      "saladillo"    "informacion" "alguno"     "libremente"
 [4,] "satelite"      "complejas"      "macri"      "anteriores"      "nacion"       "clubes"       "congreso"    "vecina"     "recibirnos"
 [5,] "ayuden"        "habian"         "pais"       "deporte"         "enfrentamos"  "anteriores"   "provincias"  "colectivo"  "alumno"    
 [6,] "encabezar"     "ratificando"    "num"        "enormemente"     "enormes"      "construccion" "prensa"      "ensena"     "podes"     
 [7,] "entendimiento" "tiempos"        "ser"        "maravilloso"     "expresa"      "demostrarles" "publicos"    "fronteras"  "deporte"   
 [8,] "funcionar"     "trabajaba"      "desarrollo" "profesionalismo" "sentirnos"    "jubilados"    "respecto"    "colegio"    "dio"       
 [9,] "jesus"         "trabajemos"     "primer"     "vuelvo"          "acompanarnos" "llenos"       "reforma"     "comunidad"  "liderar"   
[10,] "juicio"        "cabe"           "anos"       "conoci"          "continente"   "resignar"     "fiscal"      "guerra"     "gana"      
      Topic 10    
 [1,] "num"       
 [2,] "argentinos"
 [3,] "pais"      
 [4,] "trabajo"   
 [5,] "anos"      
 [6,] "aca"       
 [7,] "mundo"     
 [8,] "ser"       
 [9,] "verdad"    
[10,] "juntos"    

Visualizacion

topicmodels_json_ldavis <- function(fitted, dtm){
    svd_tsne <- function(x) tsne(svd(x)$u)

    # Find required quantities
    phi <- as.matrix(posterior(fitted)$terms)
    theta <- as.matrix(posterior(fitted)$topics)
    vocab <- colnames(phi)
    term_freq <- slam::col_sums(dtm)

    # Convert to json
    json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                            vocab = vocab,
                            mds.method = svd_tsne,
                            plot.opts = list(xlab="", ylab=""),
                            doc.length = as.vector(table(dtm$i)),
                            term.frequency = term_freq)

    return(json_lda)
}
json_res <- topicmodels_json_ldavis(lda_fit, dtm)
sigma summary: Min. : 33554432 |1st Qu. : 33554432 |Median : 33554432 |Mean : 33554432 |3rd Qu. : 33554432 |Max. : 33554432 |
Epoch: Iteration #100 error is: 14.1744077047285
Epoch: Iteration #200 error is: 0.6642175441671
Epoch: Iteration #300 error is: 0.375647353159243
Epoch: Iteration #400 error is: 0.276097083111349
Epoch: Iteration #500 error is: 0.2622322268503
Epoch: Iteration #600 error is: 0.253897306321437
Epoch: Iteration #700 error is: 0.252719375064657
Epoch: Iteration #800 error is: 0.252652072110335
Epoch: Iteration #900 error is: 0.252626857162804
Epoch: Iteration #1000 error is: 0.252588385221556
serVis(json_res)
createTcpServer: address already in use
To stop the server, run servr::daemon_stop(2) or restart your R session
Serving the directory /tmp/RtmpVR2iwu/file15a266d88ad3 at http://127.0.0.1:3222

LS0tCnRpdGxlOiAiRGlzY3Vyc29zIFByZXNpZGVuY2lhbGVzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpmdWVudGU6IGh0dHBzOi8vZ2l0aHViLmNvbS9EaWVnb0tvei9kaXNjdXJzb3NfcHJlc2lkZW5jaWFsZXMgCgpgYGB7ciBzZXR1cH0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoZ2x1ZSkKbGlicmFyeSh0bSkKbGlicmFyeSh0b3BpY21vZGVscykKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShzdHJpbmdpKQpsaWJyYXJ5KExEQXZpcykKbGlicmFyeShzbGFtKQpsaWJyYXJ5KHRzbmUpCmxpYnJhcnkobHVicmlkYXRlKQpgYGAKCmBgYHtyfQpkZiA8LSByZWFkX3JkcygnZGF0YS9kaXNjdXJzb3NfcHJlc2lkZW5jaWFsZXMucmRzJykKCmRmIDwtIGRmICU+JSAKICBtdXRhdGUodGV4dG8gPSB0b2xvd2VyKHRleHRvKSwKICAgICAgICAgdGV4dG8gPSBzdHJpX3RyYW5zX2dlbmVyYWwodGV4dG8sICJMYXRpbi1BU0NJSSIpLAogICAgICAgICB0ZXh0byA9IHN0cl90cmltKHRleHRvLHNpZGUgPSAnYm90aCcpLAogICAgICAgICB0ZXh0byA9IHN0cl9yZXBsYWNlX2FsbCh0ZXh0bywnXHQnLCcgJyksCiAgICAgICAgIHRleHRvID0gc3RyX3JlcGxhY2VfYWxsKHRleHRvLCdcbicsJyAnKSwKICAgICAgICAgdGV4dG8gPSBzdHJfcmVwbGFjZV9hbGwodGV4dG8sJ1xyJywnICcpLAogICAgICAgICB0ZXh0byA9IHN0cl9yZXBsYWNlX2FsbCh0ZXh0bywnW1s6cHVuY3Q6XV0nLCcgJyksCiAgICAgICAgIHRleHRvID0gc3RyX3JlcGxhY2VfYWxsKHRleHRvLCdcXGQnLCdOVU0nKSwKICAgICAgICAgdGV4dG8gPSBzdHJfcmVwbGFjZV9hbGwodGV4dG8sJyhOVU0pKycsJ05VTScpLAogICAgICAgICB0ZXh0byA9IHN0cl9yZXBsYWNlX2FsbCh0ZXh0bywiXFxzKyIsICIgIikpCgoKYGBgCgoKYGBge3J9CgpwYWxhYnJhc19jb211bmVzIDwtIHJlYWRfY3N2KGZpbGUgPSAnZGF0YS9yX3dvcmRzLnR4dCcsY29sX25hbWVzID0gRikKCnBhbGFicmFzX2NvbXVuZXMgPC1zdHJpX3RyYW5zX2dlbmVyYWwocGFsYWJyYXNfY29tdW5lcyRYMSwgIkxhdGluLUFTQ0lJIikKcGFsYWJyYXNfY29tdW5lcyA8LSB1bmlxdWUocGFsYWJyYXNfY29tdW5lcykKCnRleHRvIDwtIGRmJHRleHRvCgpDb3JwdXMgPSBWQ29ycHVzKFZlY3RvclNvdXJjZSh0ZXh0bykpCgpzdG9wX3dvcmRzIDwtIHVuaXF1ZShzdHJpX3RyYW5zX2dlbmVyYWwoc3RvcHdvcmRzKGtpbmQgPSAiZXMiKSwgIkxhdGluLUFTQ0lJIikpICMgbGUgdGVuZ28gcXVlIGhhY2VyIGxhIG1pc21hIHRyYW5zZm9ybWFjaW9uIHF1ZSBhbCB0ZXh0bwoKQ29ycHVzID0gdG1fbWFwKENvcnB1cywgcmVtb3ZlV29yZHMsIGMoc3RvcF93b3JkcyxwYWxhYnJhc19jb211bmVzKSkKIyBDb3JwdXMgPC0gdG1fbWFwKENvcnB1cywgc3RlbURvY3VtZW50LCBsYW5ndWFnZSA9ICJzcGFuaXNoIikgIyBDb3JwdXMgIAoKZHRtIDwtIERvY3VtZW50VGVybU1hdHJpeChDb3JwdXMpCnRtOjpuVGVybXMoZHRtKQojZWxpbWlubyBsb3MgZG9jdWVtbnRvcyB2YWNpb3MKcm93VG90YWxzIDwtIGFwcGx5KGR0bSAsIDEsIHN1bSkKbkRvY3MoZHRtKQpkdG0gICA8LSBkdG1bcm93VG90YWxzPiAwLCBdCm5Eb2NzKGR0bSkKYGBgCgpgYGB7cn0KCmxkYV9maXQgPC0gTERBKGR0bSwgayA9IDEwLG1ldGhvZCA9ICJHaWJicyIsIGNvbnRyb2wgPSBsaXN0KGRlbHRhPTAuNixzZWVkID0gMTIzNCkpCmxkYV9maXQKYGBgCgpgYGB7cn0KVGVybXMgPC0gdGVybXMobGRhX2ZpdCwgMTApClRlcm1zCmBgYAoKCgpWaXN1YWxpemFjaW9uCgoKCgpgYGB7cn0KdG9waWNtb2RlbHNfanNvbl9sZGF2aXMgPC0gZnVuY3Rpb24oZml0dGVkLCBkdG0pewogICAgc3ZkX3RzbmUgPC0gZnVuY3Rpb24oeCkgdHNuZShzdmQoeCkkdSkKCiAgICAjIEZpbmQgcmVxdWlyZWQgcXVhbnRpdGllcwogICAgcGhpIDwtIGFzLm1hdHJpeChwb3N0ZXJpb3IoZml0dGVkKSR0ZXJtcykKICAgIHRoZXRhIDwtIGFzLm1hdHJpeChwb3N0ZXJpb3IoZml0dGVkKSR0b3BpY3MpCiAgICB2b2NhYiA8LSBjb2xuYW1lcyhwaGkpCiAgICB0ZXJtX2ZyZXEgPC0gc2xhbTo6Y29sX3N1bXMoZHRtKQoKICAgICMgQ29udmVydCB0byBqc29uCiAgICBqc29uX2xkYSA8LSBMREF2aXM6OmNyZWF0ZUpTT04ocGhpID0gcGhpLCB0aGV0YSA9IHRoZXRhLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgdm9jYWIgPSB2b2NhYiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIG1kcy5tZXRob2QgPSBzdmRfdHNuZSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIHBsb3Qub3B0cyA9IGxpc3QoeGxhYj0idHNuZSIsIHlsYWI9IiIpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgZG9jLmxlbmd0aCA9IGFzLnZlY3Rvcih0YWJsZShkdG0kaSkpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgdGVybS5mcmVxdWVuY3kgPSB0ZXJtX2ZyZXEpCgogICAgcmV0dXJuKGpzb25fbGRhKQp9CmBgYAoKYGBge3J9Cmpzb25fcmVzIDwtIHRvcGljbW9kZWxzX2pzb25fbGRhdmlzKGxkYV9maXQsIGR0bSkKCnNlclZpcyhqc29uX3JlcykKYGBgCgoKCgpgYGB7cn0KdGhldGEgPC0gYXMubWF0cml4KHBvc3RlcmlvcihsZGFfZml0KSR0b3BpY3MpCgp0aGV0YSA8LSBhc190aWJibGUodGhldGEpCgpkaXN0X3RvcGljb3MgPC0gZGZbd2hpY2gocm93VG90YWxzPjApLF0gJT4lICAjdGVuZ28gcXVlIGVsaW1pbmFyIGVzZSBkb2N1ZW1udG8gcXVlIGVzdGFiYSB2YWNpbwogIHNlbGVjdChmZWNoYSx0aXR1bG8pICU+JSAKICBiaW5kX2NvbHModGhldGEpCgpgYGAKCmBgYHtyfQoKZGlzdF90b3BpY29zIDwtIGRpc3RfdG9waWNvcyAlPiUgCiAgc2VsZWN0KC10aXR1bG8pICU+JSAKICBtdXRhdGUoZmVjaGEgPSBmbG9vcl9kYXRlKGZlY2hhLCdtb250aCcpKSAlPiUgCiAgZ3JvdXBfYnkoZmVjaGEpICU+JSAKICBzdW1tYXJpc2VfYWxsKG1lYW4pCmBgYAoKYGBge3J9CmRpc3RfdG9waWNvcyAlPiUgCiAgZ2F0aGVyKHRvcGljbyxwcm9wb3JjaW9uX3Byb21lZGlvLCAyOjExKSAlPiUgCiAgZ2dwbG90KC4sIGFlcyhmZWNoYSxwcm9wb3JjaW9uX3Byb21lZGlvLCBjb2xvcj10b3BpY28pKSArCiAgZ2VvbV9saW5lKCkKYGBgCgo=